import os
import sys
import gzip
from collections import defaultdict
import pybedtools
from numpy import *


def read_gencode_genes(
    path="/osc-fs_home/scratch/mdehoon/Data/Gencode/gencode.v34.annotation.gtf.gz",
):
    """Map version-stripped gene IDs to gene names from a gzipped annotation.

    Every record yielded by ``pybedtools.BedTool`` for the file is visited;
    its ``gene_id`` attribute is split on "." and the part before the dot is
    used as the key, with the record's ``gene_name`` attribute as the value.
    If the same gene ID occurs in several records, the last one wins.

    Parameters
    ----------
    path : str, optional
        Location of the gzip-compressed annotation file.  Defaults to the
        path that was previously hard-coded in this function.

    Returns
    -------
    dict
        ``{gene_id_without_version: gene_name}``.

    Raises
    ------
    ValueError
        If a ``gene_id`` attribute does not contain exactly one "."
        (the two-way unpacking below fails).
    KeyError
        If a record lacks a ``gene_id`` or ``gene_name`` attribute.
    """
    print("Reading %s" % path)
    gene_names = {}
    # The context manager guarantees the gzip stream is closed even when a
    # record fails to parse; the records must be consumed inside the block
    # because they are read from the open stream.
    with gzip.open(path, "rt") as stream:
        for record in pybedtools.BedTool(stream):
            # The version suffix is discarded; only the stable ID is kept.
            gene_id, _version = record.attrs['gene_id'].split(".")
            gene_names[gene_id] = record.attrs['gene_name']
    return gene_names

def read_fantomcat(
    gene_names,
    path="/osc-fs_home/mdehoon/Data/Fantom6/FANTOMCAT/F6_CAT.promoter.1_to_1_ID_mapping.tsv",
):
    """Map promoters to the names of their protein-coding genes.

    The first line of the file is skipped as a header.  Every following line
    is split on whitespace and must have exactly 8 fields; field 0 is taken
    as the promoter, field 5 as the gene ID and field 7 as the gene type.
    Lines whose gene type is not ``"protein_coding"``, or whose gene ID is
    not a key of ``gene_names``, are ignored.

    Parameters
    ----------
    gene_names : mapping
        Gene ID -> gene name lookup (anything with a ``.get`` method).
    path : str, optional
        Location of the mapping file.  Defaults to the path that was
        previously hard-coded in this function.

    Returns
    -------
    collections.defaultdict
        ``{promoter: set of gene names}``; only promoters with at least one
        retained gene appear as keys.

    Raises
    ------
    AssertionError
        If a data line does not have exactly 8 whitespace-separated fields.
    StopIteration
        If the file is completely empty (no header line to skip).
    """
    print("Reading %s" % path)
    genes = defaultdict(set)
    # `with` closes the file even if a malformed line triggers an exception.
    with open(path) as stream:
        next(stream)  # skip the header line
        for line in stream:
            words = line.split()
            assert len(words) == 8, "expected 8 fields, got %d" % len(words)
            promoter = words[0]
            gene_id = words[5]
            gene_type = words[7]
            if gene_type != "protein_coding":
                continue
            gene = gene_names.get(gene_id)
            if gene is None:
                # Gene ID is absent from the supplied lookup table.
                continue
            genes[promoter].add(gene)
    return genes

def read_expression(genes, filename="promoters.FANTOM_CAT.THP-1.counts.txt"):
    """Sum per-promoter counts into per-gene counts.

    The header line must consist of the word ``promoter`` followed by the 16
    sample names listed in ``expected_samples`` below, in that order.  Each
    following line must have 17 whitespace-separated fields: a promoter
    followed by 16 numeric values.  The values of a promoter are added to
    the running total of every gene that ``genes`` associates with it;
    promoters without associated genes are skipped without parsing their
    values.

    Parameters
    ----------
    genes : mapping
        Promoter -> collection of gene names.  It is only read via ``.get``,
        so a ``defaultdict`` passed in is not populated with empty entries
        for promoters it does not know.
    filename : str, optional
        Location of the promoter count table.  Defaults to the file name
        that was previously hard-coded in this function.

    Returns
    -------
    tuple
        ``(samples, expression)`` where ``samples`` is the list of sample
        names from the header and ``expression`` is a ``defaultdict`` mapping
        each gene name to a length-16 float array of summed values.

    Raises
    ------
    AssertionError
        If the header or a data line does not have the expected layout.
    ValueError
        If a value of a promoter with associated genes is not numeric.
    """
    expected_samples = [
        "00_hr_A", "00_hr_C", "00_hr_G", "00_hr_H",
        "01_hr_A", "01_hr_C", "01_hr_G",
        "04_hr_C", "04_hr_E",
        "12_hr_A", "12_hr_C",
        "24_hr_C", "24_hr_E",
        "96_hr_A", "96_hr_C", "96_hr_E",
    ]
    n_samples = len(expected_samples)
    n_fields = n_samples + 1  # promoter column plus one column per sample
    expression = defaultdict(lambda: zeros(n_samples))
    # `with` closes the file even if one of the layout checks fails.
    with open(filename) as stream:
        words = next(stream).split()
        assert len(words) == n_fields
        assert words[0] == "promoter"
        samples = words[1:]
        assert samples == expected_samples, "unexpected sample columns"
        for line in stream:
            words = line.split()
            assert len(words) == n_fields
            # .get() rather than indexing: indexing a defaultdict would
            # insert an empty entry for every promoter it does not contain.
            promoter_genes = genes.get(words[0])
            if not promoter_genes:
                continue
            # Parse the row once, not once per gene.
            counts = array(words[1:], float)
            for gene in promoter_genes:
                expression[gene] += counts
    return samples, expression


# Load the gene-ID -> gene-name table and the promoter -> gene-name mapping,
# then sum the promoter-level counts into gene-level counts.
gene_names = read_gencode_genes()
genes = read_fantomcat(gene_names)

samples, expression = read_expression(genes)

# From here on `genes` is the sorted list of gene names that received counts,
# no longer the promoter mapping.
genes = sorted(expression.keys())
filename = "genes.FANTOM_CAT.THP-1.counts.txt"
print("Writing", filename)
# Tab-separated table: a header row ("gene" plus the sample names) followed
# by one row per gene with its summed values formatted with %g.
# `with` closes the output file even if a write fails part-way through.
with open(filename, 'w') as stream:
    stream.write("\t".join(["gene"] + list(samples)) + "\n")
    for gene in genes:
        fields = [gene] + ["%g" % value for value in expression[gene]]
        stream.write("\t".join(fields) + "\n")
